Python Data Analysis Library (pandas)
datacamp:
pip install pandas

import pandas as pd
pd.__version__
# '1.1.0'

Series: a one-dimensional labeled array capable of holding any data type
pd.Series(data, index, dtype, copy)
pd.Series() # empty series, dtype float64 by default
# from scalar
pd.Series(0) # dtype int64, integer index by default
pd.Series(0, ['e0', 'e1', 'e2']) # broadcasts the scalar across all elements
# from list
pd.Series([1, 2, 3])
pd.Series([1, 2, 3], ['e0', 'e1', 'e2'])
# from dict
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}) # keys as index by default
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}, ['e0', 'e1']) # ignore e2
pd.Series({'e0': 1, 'e1': 2, 'e2': 3}, ['e0', 'e1', 'e2', 'e3']) # include e3 as NaN
# from ndarray
pd.Series(np.array([1, 2, 3]), ['e0', 'e1', 'e2'])

Things to note:
- mixed types fall back to dtype object
- None gets converted to NaN for dtype float and dtype int
- None stays as None for dtype object

ser.shape
ser.count() # number of non-NA elements
ser.index
ser.index.name
len(ser)

ser['e0'] - but it’s safer to be explicit and use ser.loc
ser.loc['e0'] # returns a scalar (if the index value is unique)
ser.loc[['e0']]
ser.loc[['e0','e2']]
ser.loc[:'e1'] # up to and INCLUDING 'e1'
ser.loc['e1':] # from and including 'e1'

ser[0] - but it’s safer to be explicit and use ser.iloc
ser.iloc[0] # returns a scalar
ser.iloc[-1] # returns a scalar
ser.iloc[[0]]
ser.iloc[[-1]]
ser.iloc[[0, 2]]
ser.iloc[:2] # up to and EXCLUDING position 2 (i.e. 0 and 1 only)
ser.iloc[2:] # from and including position 2 (i.e. 2 onward)

# TODO

conversion
list(ser)
ser.to_list()
dict(ser)
ser.to_dict()
ser.to_numpy()

DataFrame: a two-dimensional labeled data structure with columns of potentially different types
pd.DataFrame(data, index, columns, dtype, copy)
# create empty df
df = pd.DataFrame()
# from dict of lists - each key is a column
pd.DataFrame({'c1': ['x', 'y', 'z'], 'c2': [1, 2, 3]},
index=['r1', 'r2', 'r3'])
# from dict of series - each key is a column
pd.DataFrame({'c1': pd.Series(['x', 'y', 'z'], index=['r1', 'r2', 'r3']),
              'c2': pd.Series([1, 2, 3], index=['r1', 'r2', 'r3'])})

# from list - each element is a row
pd.DataFrame([1, 2, 3])
pd.DataFrame([1, 2, 3], columns=['c1'])
# from list of lists
pd.DataFrame([['x', 1],
['y', 2],
['z', 3]],
index=['r1', 'r2', 'r3'],
columns=['c1', 'c2'])
# from list of series
pd.DataFrame([pd.Series(['x', 1], ['c1', 'c2']),
pd.Series(['y', 2], ['c1', 'c2']),
pd.Series(['z', 3], ['c1', 'c2'])],
index=['r1', 'r2', 'r3'])
# from list of dicts
pd.DataFrame([{'c1': 'x', 'c2': 1},
{'c1': 'y', 'c2': 2},
{'c1': 'z', 'c2': 3}],
             index=['r1', 'r2', 'r3'])

df.shape # (nrows, ncols) tuple
df.head() # first 5 rows
df.tail() # last 5 rows
df.info()
df.dtypes
df.columns
df.index
df.columns.values
df.columns.name # not always
df.columns.names # not always
df.index.values
df.index.name # not always
df.index.names # not always
len(df) # nrows

dtype examples:
object
float64
int64
datetime64[ns]
df['c1'] # returns a series
df[['c1']]
df[['c1', 'c2']]

Equivalently (and less ambiguously),
df.loc[:, 'c1'] # returns a series
df.loc[:, ['c1']]
df.loc[:, ['c1', 'c2']]

df.iloc[:, 0] # returns a series
df.iloc[:, [0]]
df.iloc[:, [0, 1]]

df.loc['r1'] # first row - returns a series
df.loc['r3'] # last row - returns a series
df.loc[['r1']] # first row
df.loc[['r3']] # last row

df.iloc[0] # first row - returns a series
df.iloc[-1] # last row - returns a series
df.iloc[[0]] # first row
df.iloc[[-1]] # last row

list of booleans:
df.loc[:, [True, False]]
df.loc[:, [True, True]]

boolean series:
# filter rows
df[df['c1'] == "x"]
df[(df['c1'] == "x") | (df['c1'] == "y")] # bitwise boolean OR operator
df.loc[df['c1'].isin(['x', 'y'])]

# TODO

x = df['c2'] # returns a view
x.to_numpy().flags.owndata # returns False
x[1] = 99999 # triggers a SettingWithCopyWarning
df # has been modified...

y = df['c2'][['r1', 'r2', 'r3']] # returns a copy
y.to_numpy().flags.owndata # returns True
y[1] = -99999 # no warnings
df # has not been modified

merge defaults:
* how='inner'
* on=None
* left_on=None
* right_on=None
* left_index=False
* right_index=False
pd.merge(df1, df2, on='c1', how='left')
pd.merge(df1, df2, on='c1', how='right')
pd.merge(df1, df2, on='c1', how='inner')
pd.merge(df1, df2, on='c1', how='outer')
# e.g. inner join on different column names
pd.merge(df1, df2, left_on='c1', right_on='c2')
# e.g. inner join on two column names
pd.merge(df1, df2, left_on=['c1', 'c2'], right_on=['c1', 'c2'])
# e.g. inner join on indexes
pd.merge(df1, df2, left_index=True, right_index=True)

s1 = pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
s1.append(s2) # does not modify in-place (deprecated in pandas 1.4, removed in 2.0 — use pd.concat([s1, s2]))

# e.g. apply a function element-wise
ser.apply(np.sqrt)
df.apply(np.sqrt)

# e.g. apply a function row/column-wise
df.apply(np.sum) # column sums (i.e. apply across rows)
df.apply(np.sum, axis=1) # row sums (i.e. apply across columns)

# e.g. list-like results
df.apply(lambda x: [1, 2], axis=1) # returns a list column
df.apply(lambda x: [1, 2], axis=1, result_type='expand') # expands results as columns

# TODO

astype
df['c1'] = df['c1'].astype(int)
df[['c1', 'c2']] = df[['c1', 'c2']].astype(float)

corr: calculate pair-wise correlations for numeric variables.
df.corr()
df[['c1', 'c2']].corr()

labels = ['high', 'med', 'low']
bins = np.linspace(df['c1'].min(), df['c1'].max(), len(labels) + 1)
df['binned'] = pd.cut(df['c1'], bins, labels=labels, include_lowest=True)
from matplotlib import pyplot
# value_counts() sorts by frequency, so reindex to keep the label order aligned with the bars
pyplot.bar(labels, df["binned"].value_counts().reindex(labels))

df.describe() # numerical variables only
df.describe(include='all') # include non-numeric columns too (include_all is not a valid keyword)

drop defaults:
* axis=0
* inplace=False
df.drop('r1')
df.drop('c1', axis = 1)
df.drop('c1', axis = 1, inplace=True)

dropna defaults:
* axis=0
* inplace=False
* how='any'
df.dropna() # drop rows with any missing
df.dropna(subset=['c1', 'c2']) # drop rows with any missing in column subset
df.dropna(axis=1) # drop columns with any missing
df.dropna(axis=1, how='all') # drop columns with all missing
df.dropna(axis=1, how='all', inplace=True)

# remove duplicate rows based on all columns - keep first occurrence
df.drop_duplicates()
# remove duplicate rows based on all columns - keep last occurrence
df.drop_duplicates(keep='last')
# remove duplicate rows based on all columns - remove all duplicates
df.drop_duplicates(keep=False)
# remove duplicate rows based on specific columns
df.drop_duplicates(subset=['c1', 'c2'])
# modify in-place
df.drop_duplicates(inplace=True)

df.fillna(0)
df.fillna(method='bfill', axis=0) # backward fill: use the NEXT non-missing value in the column
df.fillna(method='ffill', axis=1) # forward fill: use the PREVIOUS non-missing value in the row

dummies = pd.get_dummies(df['c1'])
df = pd.concat([df, dummies], axis=1)
df.drop('c1', axis = 1, inplace=True)
df

# e.g. group by c1 and c2 and get the grouped means of c3 and c4
df = df[['c1', 'c2', 'c3', 'c4']]
df.groupby(['c1', 'c2'], as_index=False).mean()

groupby attributes:
grouped_df = df.groupby(['col'])
# list all grouping levels
list(grouped_df.groups.keys())
# group by col which has levels a and b
grouped_df.get_group("a")
grouped_df.get_group("b")

# TODO

import pandas as pd
df = pd.DataFrame(dict(c1=[1,2,3]))
df
# c1
# 0 1
# 1 2
# 2 3
mapper = dict(zip([1,2,3], ['r1','r2','r3']))
df['c1'] = df['c1'].map(mapper)
df
# c1
# 0 r1
# 1 r2
# 2 r3

# TODO

pivot — i.e. pivot wider (spread)
# e.g. col1 as rownames, col2 as colnames, col3 as values
df = df[['c1', 'c2', 'c3']]
df.pivot(index='c1', columns='c2')

pivot_table: a generalization of pivot that can handle duplicate values for one pivoted index/column pair
can supply a list of aggregation functions using aggfunc argument (default is np.mean).
can handle multiple columns for the index and column of the pivoted table (in which case a hierarchical index is generated).
# TODO

# hist
count, bin_edges = np.histogram(df[['x1', 'x2']])
df[['x1', 'x2']].plot(kind='hist',
xticks=bin_edges,
bins=15,
alpha=0.35,
color=['coral', 'darkslateblue'],
stacked=True, # if more than one xvar
xlim=(xmin, xmax))
# scatter
df.plot(x='xvar', y='yvar', kind='scatter',
        alpha=0.5, # transparency
        color='green',
        s=10, # size weight (can be a list)
        xlim=(0, 100))
# line
df[['y1', 'y2']].plot(kind='line') # index is x-axis
# area (assumes wide format, x = index)
df.plot(kind='area',
stacked=False,
alpha=0.35,
        figsize=(20, 10))

| method | description |
|---|---|
| bar | vertical bar plots |
| barh | horizontal bar plots |
| hist | histogram |
| box | boxplot |
| kde or density | density plots |
| area | area plots |
| pie | pie plots |
| scatter | scatter plots |
| hexbin | hexbin plot |
annotations
# using plt
plt.title('title')
plt.ylabel('ylabel')
plt.xlabel('xlabel')
# using ax
ax = df.plot(kind='asdf')
ax.set_title('title')
ax.set_ylabel('ylabel')
ax.set_xlabel('xlabel')

rename defaults:
* axis=0
* inplace=False
df.rename(dict(r1='r99')) # maps rows by default
df.rename(columns=dict(c1='c999'))
df.rename(mapper=dict(c1='c999'), axis=1)

df['c1'].replace(np.nan, df['c1'].mean())
df['Gender'].replace(to_replace=['male','female'], value=[0,1], inplace=True)

rename_axis — add/replace axis name.
df.rename_axis('new_index_name')
df.rename_axis('new_index_name', inplace=True)
df.rename_axis('new_column_name', axis=1)
df.rename_axis('new_column_name', axis=1, inplace=True)

reset_index — set the index to 0, 1, 2, …
df.reset_index() # adds the old index as a new column
df.reset_index(inplace=True)
df.reset_index(drop=True) # doesn't add the old index as a new column
df.reset_index(drop=True, inplace=True)

set_index — use a column/columns as the index
df.set_index('c1') # destroys the old index
df.set_index('c1', drop=False) # doesn't destroy the old index
df.set_index('c1', inplace=True)

multi-level indexing:
df.set_index([df.index, 'c1'])
df.set_index(['c1', 'c2'])

df.sort_values(by='c1')
df.sort_values(by='c1', ascending=False)
df.sort_values(by='c1', ascending=False, na_position='first')
df.sort_values(by='c1', ascending=False, na_position='first', inplace=True)

Multiple columns
df.sort_values(['c1', 'c2'], ascending=[True, False])

str.split example:
import pandas as pd
df = pd.DataFrame(dict(c1 = ['1,2,3', '4,5,6', '7,8,9']))
df
# c1
# 0 1,2,3
# 1 4,5,6
# 2 7,8,9
df['c1'].str.split(',', expand=True)
# 0 1 2
# 0 1 2 3
# 1 4 5 6
# 2 7 8 9

# TODO

df.to_string(float_format='{:.6f}'.format, # e.g. print floats using 6dp
             index=False,
             header=False)

transpose — cols to rows and rows to cols
df.transpose() # method
df.T # an attribute

df['c1'].unique()

df['c1'].value_counts()
df['c1'].value_counts(normalize=True)
df['c1'].value_counts().to_frame() # convert series to dataframe

df.where(condition) # replace with NaN where condition is False
df.where(df['c1'] > 10)
df.where(df['c1'] > 10, inplace=True)

melt. # TODO

pd.read_csv("file.csv")
pd.read_csv('file.csv', skiprows=1) # ignore first row and use row 2 for column names
pd.read_csv("file.csv", header=None, names=['c1', 'c2', 'c3'])
pd.read_csv('file.csv', index_col = 0) # use first col as index

# TODO

df = pd.read_json("file.json")

df = pd.read_sql(query, con)

pip install xlrd
df = pd.read_excel("file.xlsx",
                   sheet_name='Canada by Citizenship',
                   skiprows=range(20),
                   skipfooter=2)

df.to_csv("file.csv")
df.to_json("file.json")
df.to_sql("table_name", con) # first argument is a table name plus a connection, not a file path
df.to_excel("file.xlsx")

with pd.option_context(
):
    print(df)

# TODO

Create from scratch, lists, NumPy arrays, dicts, series or other data frames
.shape, len(), .index, .count, .columns, .columns.values
.loc, .iloc, .at, .iat, .ix
set_index to use a column as an index
use column as the index df['colname'] = df.index
the general recommendation is that you use .loc to insert rows in your DataFrame
drop
read_csv, parse_dates
.to_csv, .to_excel
pivot, pivot_table, stack, unstack, melt
.iterrows
drop_cols = ['thal', 'thalach']
heart_Pr = pd.read_csv('heart.csv', usecols=lambda cc: cc not in drop_cols, index_col=False)

import pandas as pd
heart_P = pd.read_csv('heart.csv')
# print first rows
print(heart_P.head(5))
# Print some basic info about the dataframe
heart_P.describe()
# Count grouped cases
heart_P.groupby('fbs').count()
# Count / mean of grouped cases
heart_P.groupby(['sex', 'fbs']).count()
heart_P.groupby(['sex', 'fbs']).mean()

# Select first row. Returns Series.
print(ChildAgeFrame.iloc[0])
# Select first row. Returns Dataframe.
print(ChildAgeFrame.iloc[[0]])
# Select first column
print(ChildAgeFrame.iloc[:, 0])
# Select last column
print(ChildAgeFrame.iloc[:, -1])
# Select first element
print(ChildAgeFrame.iloc[0, 0])
# Select first 2 rows
print(ChildAgeFrame.iloc[0:2])
# Select first 2 columns
print(ChildAgeFrame.iloc[:, 0:2])

# Select columns using name
print(heart_P[['chol', 'fbs']])
# Select columns using regular expressions (axis=1 selects columns, not rows)
df = heart_P.filter(regex='t.+l', axis=1)
# Select rows using a logical condition
df = heart_P[(heart_P['chol'] > 200) & (heart_P['thalach'] == 178)]

HotelFrame = pd.DataFrame({'Room Type': ['Regular', 'Suite', 'Regular', 'Lux Suite', 'Suite'],
                           'Floor': ['1', '2', '3', '3', '2']})
pets = ['No', 'Yes', 'No', 'No', 'Yes']
HotelFrame['pet'] = pets
# Using a map to implement logic
pets_allowed = {'Regular': 'No', 'Suite': 'Yes', 'Lux Suite': 'No'}
# Add column using the above map
HotelFrame['PETS'] = HotelFrame['Room Type'].map(pets_allowed)
# ADD a ROW (DataFrame.append was removed in pandas 2.0 — use pd.concat instead)
dfadd = pd.concat([HotelFrame,
                   pd.DataFrame([{'Room Type': 'Regular', 'Floor': '1', 'pet': 'No', 'PETS': 'No'}])],
                  ignore_index=True)
print(dfadd)

ChildAgeFrame = pd.DataFrame(np.array(([15, 0], [69, 3], [35, 2])),
                             columns=['age', 'children'],
                             index=['Tes', 'Linda', 'Kate'])
# Compute the mean of the columns
print(ChildAgeFrame.apply(np.mean, axis=0))
# Apply a lambda function to all elements
print(ChildAgeFrame.apply(lambda x: x * 5))